Differential expression using the bulk RNAseq data from the May 2024 release of JAX_RNAseq2_ExtraEmbryonic.
There are two model systems involved in this dataset, ExM & PrS.
There are only normoxia samples involved in this dataset.
There is only one day value.
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)
library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())
path = "../../MorPhiC/data/May-2024/JAX_RNAseq2_ExtraEmbryonic/"
dat_path = paste0(path, "processed/release110-gencode44/Tables")meta = fread("data/May_2024/JAX_RNAseq2_ExtraEmbryonic_meta_data.tsv",
data.table = FALSE)
dim(meta)## [1] 82 36
meta[1:2,]## clonal.label library_preparation.label label
## 1 MOK20011-WT1 GT23-13719 PrS-MOK20011-WT1
## 2 MOK20011-WT2 GT23-13720 PrS-MOK20011-WT2
## description differentiated_product_protocol_id
## 1 KOLF2.2 derived primitive syncytium JAXPD001
## 2 KOLF2.2 derived primitive syncytium JAXPD001
## model_system timepoint_value timepoint_unit final_timepoint
## 1 PrS 6 days Yes
## 2 PrS 6 days Yes
## treatment_condition wt_control_status ko_strategy ko_gene
## 1 Not applicable WT WT WT
## 2 Not applicable WT WT WT
## library_preparation.description
## 1 NA
## 2 NA
## library_preparation.library_preparation_protocol_id
## 1 JAXPL001
## 2 JAXPL001
## library_preparation.average_fragment_size
## 1 417
## 2 402
## library_preparation.input_amount_value library_preparation.input_amount_unit
## 1 300 ng
## 2 300 ng
## library_preparation.concentration_value
## 1 34.51784
## 2 17.18679
## library_preparation.concentration_unit library_preparation.total_yield_value
## 1 nM 190.0
## 2 nM 91.2
## library_preparation.total_yield_unit library_preparation.cdna_pcr_cycles
## 1 ng 10
## 2 ng 10
## library_preparation.pcr_cycles_for_indexing
## 1 Not available
## 2 Not available
## file_id run_id
## 1 RUNX1_WT1_GT23-13719_CCAAGTCT-TCATCCTT_S170_L003 20231124_23-robson-013
## 2 RUNX1_WT2_GT23-13720_TTGGACTC-CTGCTTCC_S157_L003 20231124_23-robson-013
## clonal.description clonal.parental_name clonal.clone_id clonal.type
## 1 KOLF2.2 KOLF2.2J WT1 iPSC
## 2 KOLF2.2 KOLF2.2J WT2 iPSC
## clonal.zygosity clonal.cell_line_generation_protocol
## 1 Not applicable Not applicable
## 2 Not applicable Not applicable
## clonal.treatment_condition clonal.wt_control_status
## 1 Not applicable WT
## 2 Not applicable WT
## expression_alteration.label model_organ
## 1 Not applicable extra-embryonic primitive syncytial cells
## 2 Not applicable extra-embryonic primitive syncytial cells
names(meta)## [1] "clonal.label"
## [2] "library_preparation.label"
## [3] "label"
## [4] "description"
## [5] "differentiated_product_protocol_id"
## [6] "model_system"
## [7] "timepoint_value"
## [8] "timepoint_unit"
## [9] "final_timepoint"
## [10] "treatment_condition"
## [11] "wt_control_status"
## [12] "ko_strategy"
## [13] "ko_gene"
## [14] "library_preparation.description"
## [15] "library_preparation.library_preparation_protocol_id"
## [16] "library_preparation.average_fragment_size"
## [17] "library_preparation.input_amount_value"
## [18] "library_preparation.input_amount_unit"
## [19] "library_preparation.concentration_value"
## [20] "library_preparation.concentration_unit"
## [21] "library_preparation.total_yield_value"
## [22] "library_preparation.total_yield_unit"
## [23] "library_preparation.cdna_pcr_cycles"
## [24] "library_preparation.pcr_cycles_for_indexing"
## [25] "file_id"
## [26] "run_id"
## [27] "clonal.description"
## [28] "clonal.parental_name"
## [29] "clonal.clone_id"
## [30] "clonal.type"
## [31] "clonal.zygosity"
## [32] "clonal.cell_line_generation_protocol"
## [33] "clonal.treatment_condition"
## [34] "clonal.wt_control_status"
## [35] "expression_alteration.label"
## [36] "model_organ"
table(table(meta$clonal.label))##
## 1
## 82
table(table(meta$library_preparation.label))##
## 1
## 82
table(table(meta$label))##
## 1
## 82
cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)## [1] 38592 83
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]## Name MEIS2_CE_A06_GT23-12174_TGCGAGAC-CAACAATG_S1_L001
## 1 ENSG00000268674 0
## 2 ENSG00000271254 793
## MEF2C_PTC_B03_GT23-13717_TCTCTACT-GAACCGCG_S177_L003
## 1 0
## 2 937
## RUNX1_KO_G07_GT23-13724_CGGCGTGA-ACAGGCGC_S175_L003
## 1 0
## 2 1079
stopifnot(setequal(names(cts)[-1], meta$file_id))
meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)##
## TRUE
## 82
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name
table(meta$timepoint_value, useNA="ifany")##
## 6
## 82
meta$timepoint = as.character(meta$timepoint_value)gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)## [1] 62754 8
gene_anno[1:2,]## geneId chr strand start end ensembl_gene_id
## <char> <char> <char> <int> <int> <char>
## 1: ENSG00000000003.16 chrX - 100627108 100639991 ENSG00000000003
## 2: ENSG00000000005.6 chrX + 100584936 100599885 ENSG00000000005
## hgnc_symbol description
## <char> <char>
## 1: TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2: TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(rownames(cts_dat) %in% gene_anno$ensembl_gene_id)##
## TRUE
## 38592
# all ensembl gene IDs are contained in the annotation
setdiff(rownames(cts_dat), gene_anno$ensembl_gene_id)## character(0)
model_s = meta$model_system
table(model_s, useNA="ifany")## model_s
## ExM PrS
## 34 48
get_q75 <- function(x){quantile(x, probs = 0.75)}
gene_info = data.frame(Name = cts$Name,
t(apply(cts_dat, 1, tapply, model_s, min)),
t(apply(cts_dat, 1, tapply, model_s, median)),
t(apply(cts_dat, 1, tapply, model_s, get_q75)),
t(apply(cts_dat, 1, tapply, model_s, max)))
dim(gene_info)## [1] 38592 9
gene_info[1:2,]## Name ExM PrS ExM.1 PrS.1 ExM.2 PrS.2 ExM.3 PrS.3
## ENSG00000268674 ENSG00000268674 0 0 0 0.0 0 0.00 0 1
## ENSG00000271254 ENSG00000271254 594 512 797 958.5 858 1051.75 1079 1695
table(row.names(gene_info)==gene_info$Name, useNA="ifany")##
## TRUE
## 38592
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4),
rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)## [1] 38592 9
gene_info[1:2,]## Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674 0 0 0 0.0 0
## ENSG00000271254 ENSG00000271254 594 512 797 958.5 858
## PrS_q75 ExM_max PrS_max
## ENSG00000268674 0.00 0 1
## ENSG00000271254 1051.75 1079 1695
summary(gene_info[,-1])## ExM_min PrS_min ExM_median PrS_median
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 2.0 Median : 1.5
## Mean : 323.5 Mean : 335.2 Mean : 593.2 Mean : 647.1
## 3rd Qu.: 100.0 3rd Qu.: 71.0 3rd Qu.: 217.0 3rd Qu.: 177.5
## Max. :240570.0 Max. :209625.0 Max. :883830.0 Max. :382389.5
## ExM_q75 PrS_q75 ExM_max PrS_max
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 1.0 1st Qu.: 1
## Median : 3.8 Median : 3.0 Median : 8.0 Median : 8
## Mean : 731.5 Mean : 772.1 Mean : 1063.6 Mean : 1188
## 3rd Qu.: 279.0 3rd Qu.: 217.6 3rd Qu.: 400.2 3rd Qu.: 330
## Max. :1028917.5 Max. :454140.8 Max. :1675470.0 Max. :773629
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)##
## FALSE TRUE
## FALSE 12951 1098
## TRUE 1885 22658
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)## [1] 25641 82
gene_info = gene_info[w2kp,]
meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)
ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "KO type", shape = "Model") +
scale_color_brewer(palette = "Set1")meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = model_system)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "Run ID", shape = "Model") +
scale_color_brewer(palette = "Set1")sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
summary(meta_m$q75)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 428.0 510.0 554.0 574.9 649.0 727.0
dim(cts_dat_m)## [1] 25641 21
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)## [1] 21 24263
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)## [1] "sdev" "rotation" "center" "scale" "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)## [1] 0.47564883 0.12518353 0.07086713 0.05336636 0.02717944
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs## [1] 47.6 12.5 7.1 5.3 2.7
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=model_system,
color=run_id_short)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
labs(color="Run id", shape ="Model") +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)## Generate sample information matrix
cols2kp = c("model_system", "q75")
sample_info = meta_m[,cols2kp]
dim(sample_info)## [1] 21 2
sample_info[1:2,]## model_system q75
## 59 PrS 554
## 25 PrS 428
sample_info$log_q75 = log(sample_info$q75)
dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info,
~ model_system + log_q75)
dds = DESeq(dds)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)## [1] 25641 6
res_rd[1:2,]## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 944.5427577 0.1298778 0.3327253 0.3903454 0.6962812
## ENSG00000276345 0.3686712 -5.3837662 9.7887890 -0.5499931 0.5823241
## padj
## ENSG00000271254 0.8840768
## ENSG00000276345 NA
table(is.na(res_rd$padj))##
## FALSE TRUE
## 14702 10939
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("WT Read depth")
print(g0)## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ log_q75)
res_lrt = results(dds_lrt)
res_model_system = as.data.frame(res_lrt)
dim(res_model_system)## [1] 25641 6
res_model_system[1:2,]## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 944.5427577 0.1298778 0.3327253 31.92267820 1.604331e-08
## ENSG00000276345 0.3686712 -5.3837662 9.7887890 -0.01029894 1.000000e+00
## padj
## ENSG00000271254 4.7729e-08
## ENSG00000276345 1.0000e+00
table(is.na(res_model_system$padj))##
## FALSE TRUE
## 24642 999
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("WT Model system")
print(g0)Explore the PC plots first, to decide running the models on what level of DE group.
Then run the DE analysis.
table(meta$run_id, meta$model_system)##
## ExM PrS
## 20230809_23-robson-007-run2 0 12
## 20231004_23-robson-011 22 0
## 20231124_23-robson-013 12 12
## 20240307_24-robson-002 0 12
## 20240307_24-robson-005 0 12
table(meta$run_id, meta$ko_gene)##
## BHLHE40 MEF2C MEIS1 MEIS2 MXD1 NCOA3 RUNX1 WT
## 20230809_23-robson-007-run2 0 0 0 0 9 0 0 3
## 20231004_23-robson-011 0 0 9 7 0 0 0 6
## 20231124_23-robson-013 0 9 0 0 0 0 9 6
## 20240307_24-robson-002 9 0 0 0 0 0 0 3
## 20240307_24-robson-005 0 0 0 0 0 9 0 3
table(meta$run_id, meta$model_system, meta$ko_gene)## , , = BHLHE40
##
##
## ExM PrS
## 20230809_23-robson-007-run2 0 0
## 20231004_23-robson-011 0 0
## 20231124_23-robson-013 0 0
## 20240307_24-robson-002 0 9
## 20240307_24-robson-005 0 0
##
## , , = MEF2C
##
##
## ExM PrS
## 20230809_23-robson-007-run2 0 0
## 20231004_23-robson-011 0 0
## 20231124_23-robson-013 9 0
## 20240307_24-robson-002 0 0
## 20240307_24-robson-005 0 0
##
## , , = MEIS1
##
##
## ExM PrS
## 20230809_23-robson-007-run2 0 0
## 20231004_23-robson-011 9 0
## 20231124_23-robson-013 0 0
## 20240307_24-robson-002 0 0
## 20240307_24-robson-005 0 0
##
## , , = MEIS2
##
##
## ExM PrS
## 20230809_23-robson-007-run2 0 0
## 20231004_23-robson-011 7 0
## 20231124_23-robson-013 0 0
## 20240307_24-robson-002 0 0
## 20240307_24-robson-005 0 0
##
## , , = MXD1
##
##
## ExM PrS
## 20230809_23-robson-007-run2 0 9
## 20231004_23-robson-011 0 0
## 20231124_23-robson-013 0 0
## 20240307_24-robson-002 0 0
## 20240307_24-robson-005 0 0
##
## , , = NCOA3
##
##
## ExM PrS
## 20230809_23-robson-007-run2 0 0
## 20231004_23-robson-011 0 0
## 20231124_23-robson-013 0 0
## 20240307_24-robson-002 0 0
## 20240307_24-robson-005 0 9
##
## , , = RUNX1
##
##
## ExM PrS
## 20230809_23-robson-007-run2 0 0
## 20231004_23-robson-011 0 0
## 20231124_23-robson-013 0 9
## 20240307_24-robson-002 0 0
## 20240307_24-robson-005 0 0
##
## , , = WT
##
##
## ExM PrS
## 20230809_23-robson-007-run2 0 3
## 20231004_23-robson-011 6 0
## 20231124_23-robson-013 3 3
## 20240307_24-robson-002 0 3
## 20240307_24-robson-005 0 3
table(meta$ko_strategy, meta$ko_gene)##
## BHLHE40 MEF2C MEIS1 MEIS2 MXD1 NCOA3 RUNX1 WT
## CE 3 3 3 3 3 3 3 0
## KO 3 3 3 1 3 3 3 0
## PTC 3 3 3 3 3 3 3 0
## WT 0 0 0 0 0 0 0 21
unique_model_ss = unique(meta$model_system)
unique_model_ss = sort(unique_model_ss)
for(model1 in unique_model_ss){
print(model1)
sample2kp = which(meta$model_system == model1)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
print(table(meta_m$ko_strategy, str_extract(colnames(cts_dat_m), "^[^_]+_[^_]+")))
extracted_ko_strings = str_extract_all(colnames(cts_dat_m), "CE|KO|PTC|WT")
print(table(sapply(extracted_ko_strings, length)))
print(table(meta_m$ko_strategy, unlist(extracted_ko_strings)))
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(model1) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(model1) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene,
alpha = ifelse(ko_gene == "WT", 1, 0.8))) +
geom_point(size=2.5) +
scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(model1) + guides(alpha = "none") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
table(meta_m$run_id, meta_m$ko_gene)
}## [1] "ExM"
##
## MEF2C_CE MEF2C_KO MEF2C_PTC MEF2C_WT1 MEF2C_WT2 MEF2C_WT3 MEIS1_CE
## CE 3 0 0 0 0 0 3
## KO 0 3 0 0 0 0 0
## PTC 0 0 3 0 0 0 0
## WT 0 0 0 1 1 1 0
##
## MEIS1_KO MEIS1_PTC MEIS1_WT MEIS2_CE MEIS2_KO MEIS2_PTC MEIS2_WT
## CE 0 0 0 3 0 0 0
## KO 3 0 0 0 1 0 0
## PTC 0 3 0 0 0 3 0
## WT 0 0 3 0 0 0 3
##
## 1
## 34
##
## CE KO PTC WT
## CE 9 0 0 0
## KO 0 7 0 0
## PTC 0 0 9 0
## WT 0 0 0 9
## [1] "PrS"
##
## BMLHE40_CE BMLHE40_KO BMLHE40_PTC BMLHE40_WT MDX1_CE MDX1_KO MDX1_KOLF2
## CE 3 0 0 0 3 0 0
## KO 0 3 0 0 0 3 0
## PTC 0 0 3 0 0 0 0
## WT 0 0 0 3 0 0 3
##
## MDX1_PTC NC0A3_CE NC0A3_KO NC0A3_PTC NC0A3_WT RUNX1_CE RUNX1_KO RUNX1_PTC
## CE 0 3 0 0 0 3 0 0
## KO 0 0 3 0 0 0 3 0
## PTC 3 0 0 3 0 0 0 3
## WT 0 0 0 0 3 0 0 0
##
## RUNX1_WT1 RUNX1_WT2 RUNX1_WT3
## CE 0 0 0
## KO 0 0 0
## PTC 0 0 0
## WT 1 1 1
##
## 1
## 48
##
## CE KO PTC WT
## CE 12 0 0 0
## KO 0 12 0 0
## PTC 0 0 12 0
## WT 0 3 0 9
Under model system “Exm”, the batch effect caused by run id seems to be big. Run DE analysis for different run ids separately.
Under model system “PrS”, still run analysis together.
For the output files, create two versions, one with direct information, one with padj set to NA for the genes when the number of 0 per test is >=4.
In more details, for both versions of the outputs, make it one file per knockout gene per DE group (DE group can be model system, model system + day + condition, model system + day + run_id, model system + day, etc.).
The first version of the output files contains
gene id, gene symbol
and for each knockout strategy:
baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
The second version of the output files contains
gene id, gene symbol, chr, strand, position
and for each knockout strategy:
log2FoldChange, pvalue, padj (set to be NA if the number of 0 per test >= 4),
In addition, save out a separate file of the sample size for each comparison. This needs to be by DE group.
df_size = NULL
release_name = "2024_05_JAX_RNAseq2_ExtraEmbryonic"
output_dir = file.path("results", release_name)
raw_output_dir = file.path(output_dir, "raw")
processed_output_dir = file.path(output_dir, "processed")
if (!file.exists(raw_output_dir)){
dir.create(raw_output_dir, recursive = TRUE)
}
if (!file.exists(processed_output_dir)){
dir.create(processed_output_dir, recursive = TRUE)
}for(model1 in unique_model_ss){
print(model1)
sample2kp = which(meta$model_system == model1)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
print(table(meta_m$ko_strategy, str_extract(colnames(cts_dat_m), "^[^_]+_[^_]+")))
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
print(table(meta_m$file_id == colnames(cts_dat_m), useNA="ifany"))
## Generate sample information matrix
cols2kp = c("model_system", "ko_strategy", "ko_gene", "run_id", "q75", "timepoint")
sample_info = meta_m[,cols2kp]
colnames(sample_info)[which(colnames(sample_info)=="timepoint")] = "day"
dim(sample_info)
sample_info[1:2,]
print(table(sample_info$ko_strategy, sample_info$ko_gene))
sample_info$ko_group = paste(sample_info$ko_gene,
sample_info$ko_strategy, sep="_")
print(table(sample_info$ko_group))
print(length(unique(sample_info$ko_group)))
print(table(sample_info$run_id, sample_info$ko_group))
sample_info$model_day = paste0(sample_info$model_system,
"_day_",
as.character(sample_info$day))
# run DESeq2 for the two run ids separately for model system ExM
if (model1 == "ExM"){
sample_info$de_grp = paste0(sample_info$model_day, "_", sample_info$run_id)
}else{
sample_info$de_grp = sample_info$model_day
}
sorted_de_grps = sort(unique(sample_info$de_grp))
for (d_group in sorted_de_grps){
dg_index = which(sample_info$de_grp==d_group)
cts_dat_m_dg = cts_dat_m[, dg_index]
sample_info_dg = sample_info[dg_index, ]
stopifnot(all(sample_info_dg$de_grp==d_group))
print("-----------------------------------")
print(paste0("DE analysis group: ", d_group))
if (sum(sample_info_dg$ko_gene=="WT")<2){
print(sprintf("DE analysis group %s has only %d wild type samples",
d_group, sum(sample_info_dg$ko_gene=="WT")))
print(sprintf("do not run DESeq2 for it"))
print("-----------------------------------")
break
}else{
print(sprintf("DE analysis group %s DESeq2 results:", d_group))
print("-----------------------------------")
}
# do two steps of filtering
# first, filter based on q75 of each gene
# second, filter based on whether each gene is expressed in at least 4 cells
dg_q75 = apply(cts_dat_m_dg, 1, quantile, probs=0.75)
cts_dat_m_dg = cts_dat_m_dg[dg_q75 > 0,]
print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d",
length(dg_q75), nrow(cts_dat_m_dg)))
n_pos = rowSums(cts_dat_m_dg>0)
cts_dat_m_dg = cts_dat_m_dg[n_pos >= 4,]
if (length(n_pos)==nrow(cts_dat_m_dg)){
print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
}else{
print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d",
length(n_pos), nrow(cts_dat_m_dg)))
}
# update the q75 based on only genes used in the process
sample_info_dg$q75 = apply(cts_dat_m_dg, 2, quantile, probs = 0.75)
sample_info_dg$log_q75 = log(sample_info_dg$q75)
if (length(unique(sample_info_dg$run_id))==1){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + log_q75)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + run_id + log_q75)
}
start.time <- Sys.time()
dds = DESeq(dds)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
res_rd[1:2,]
table(is.na(res_rd$padj))
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " read depth"))
print(g0)
## Rerun the analysis without read-depth if it is not significant for
## most genes, and the number of samples is small
## i.e., proportion of non-null in the distribution is smaller than 0.01
## (this needs to be restricted to the genes with not NA adjusted pvalue)
## or if smaller than 0.1 and the number of samples is not greater than 6
pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
if(pi_1_rd < 0.01 || (ncol(cts_dat_m_dg) <= 6 && pi_1_rd < 0.1 )){
print("rerun DESeq2 without read depth")
include_rd = 0
if (length(unique(sample_info_dg$run_id))==1){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + run_id)
}
start.time <- Sys.time()
dds = DESeq(dds)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
}else{
include_rd = 1
}
## association with run_id
if (length(unique(sample_info_dg$run_id))>1){
if(include_rd){
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
}else{
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group)
}
res_lrt = results(dds_lrt)
res_run_id = as.data.frame(res_lrt)
dim(res_run_id)
res_run_id[1:2,]
table(is.na(res_run_id$padj))
g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " Run ID"))
print(g0)
}
## DE analysis for each KO gene
## for knockout gene MEIS2 and knockout strategy KO
## there is only one knockout sample
## need to set the padj for all genes to NA for this setting of comparison
table(sample_info_dg$ko_gene)
table(sample_info_dg$ko_gene, sample_info_dg$ko_strategy)
genos = setdiff(unique(sample_info_dg$ko_gene), "WT")
genos = sort(genos)
genos
ko_grp = c("CE", "KO", "PTC")
for(geno in genos){
res_geno_df = NULL
res_geno_reduced_df = NULL
res_geno = list()
for(k_grp1 in ko_grp){
res_g1 = results(dds, contrast = c("ko_group",
paste0(geno, "_", k_grp1), "WT_WT"))
res_g1 = signif(data.frame(res_g1), 3)
# add a column to record the number of zeros per test
test_index = which(sample_info_dg$ko_group%in%c(paste0(geno, "_", k_grp1), "WT_WT"))
cts_dat_m_dg_test = cts_dat_m_dg[, test_index]
n_zero = rowSums(cts_dat_m_dg_test==0)
res_g1$n_zeros = n_zero
# record the number of samples involved in the comparison
test_ko_group_vec = sample_info_dg$ko_group[test_index]
df_size = rbind(df_size,
c(d_group,
paste0(geno, "_", k_grp1),
sum(test_ko_group_vec!="WT_WT"),
sum(test_ko_group_vec=="WT_WT")))
# prepare a processed version of output
res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
if (sum(test_ko_group_vec!="WT_WT")==1){
res_g1_reduced$padj = NA
}
res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
res_geno[[k_grp1]] = res_g1_reduced
if (sum(test_ko_group_vec!="WT_WT")>1){
g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " ", geno, "_", k_grp1))
print(g1)
}
tag1 = sprintf("%s_%s_", geno, k_grp1)
colnames(res_g1) = paste0(tag1, colnames(res_g1))
colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))
if(is.null(res_geno_df)){
res_geno_df = res_g1
}else if(!is.null(res_geno_df)){
stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
res_geno_df = cbind(res_geno_df, res_g1)
}
if(is.null(res_geno_reduced_df)){
res_geno_reduced_df = res_g1_reduced
}else if(!is.null(res_geno_reduced_df)){
stopifnot(all(rownames(res_geno_reduced_df) == rownames(res_g1_reduced)))
res_geno_reduced_df = cbind(res_geno_reduced_df, res_g1_reduced)
}
}
get_index <- function(x){
which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
}
# make an exception for the case of
# d_group = "ExM_20231004_23-robson-011"
# geno = "MEIS2"
if ((d_group=="ExM_day_6_20231004_23-robson-011")&(geno=="MEIS2")){
w_ce = get_index(res_geno[["CE"]])
w_ptc = get_index(res_geno[["PTC"]])
print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
print("There is no result for knock strategy KO due to number of knockout samples being only 1.")
print(paste0("number of DE genes from knock out strategy CE: ",
as.character(length(w_ce))))
print(paste0("number of DE genes from knock out strategy PTC: ",
as.character(length(w_ptc))))
ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc],
rownames(res_geno[["CE"]])[w_ce]))
print(paste0("number of common DE genes by knock out strategies PTC and CE: ",
as.character(ptc_ce_overlap)))
m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce],
"PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
g_up = UpSet(m)
print(g_up)
df1 = data.frame(padj_CE = res_geno[["CE"]]$padj,
padj_PTC = res_geno[["PTC"]]$padj)
cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_CE))) +
geom_pointdensity() +
ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) +
scale_color_viridis()
print(g4)
}else{
w_ce = get_index(res_geno[["CE"]])
w_ko = get_index(res_geno[["KO"]])
w_ptc = get_index(res_geno[["PTC"]])
print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
print(paste0("number of DE genes from knock out strategy CE: ",
as.character(length(w_ce))))
print(paste0("number of DE genes from knock out strategy KO: ",
as.character(length(w_ko))))
print(paste0("number of DE genes from knock out strategy PTC: ",
as.character(length(w_ptc))))
ce_ko_overlap = length(intersect(rownames(res_geno[["CE"]])[w_ce],
rownames(res_geno[["KO"]])[w_ko]))
ko_ptc_overlap = length(intersect(rownames(res_geno[["KO"]])[w_ko],
rownames(res_geno[["PTC"]])[w_ptc]))
ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc],
rownames(res_geno[["CE"]])[w_ce]))
print(paste0("number of common DE genes by knock out strategies CE and KO: ",
as.character(ce_ko_overlap)))
print(paste0("number of common DE genes by knock out strategies KO and PTC: ",
as.character(ko_ptc_overlap)))
print(paste0("number of common DE genes by knock out strategies PTC and CE: ",
as.character(ptc_ce_overlap)))
m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce],
"KO" = rownames(res_geno[["KO"]])[w_ko],
"PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
g_up = UpSet(m)
print(g_up)
df1 = data.frame(padj_CE = res_geno[["CE"]]$padj,
padj_KO = res_geno[["KO"]]$padj,
padj_PTC = res_geno[["PTC"]]$padj)
cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_CE))) +
geom_pointdensity() + ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) +
scale_color_viridis()
g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_KO))) +
geom_pointdensity() + ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr2)) +
scale_color_viridis()
print(g4)
print(g5)
}
dim(res_geno_df)
res_geno_df[1:2,]
dim(res_geno_reduced_df)
res_geno_reduced_df[1:2,]
res_df = data.frame(gene_ID=rownames(res_geno_df),
res_geno_df)
dim(res_df)
res_df[1:2,]
res_reduced_gene_anno = gene_anno[match(rownames(res_geno_reduced_df), gene_anno$ensembl_gene_id), ]
stopifnot(all(rownames(res_geno_reduced_df)==res_reduced_gene_anno$ensembl_gene_id))
# set gene hgnc_symbol that equal "" to NA
hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
res_reduced_df = data.frame(gene_ID=rownames(res_geno_reduced_df),
hgnc_symbol=hgnc_symbol_vec,
chr=res_reduced_gene_anno$chr,
strand=res_reduced_gene_anno$strand,
start=res_reduced_gene_anno$start,
end=res_reduced_gene_anno$end,
res_geno_reduced_df)
print("hgnc_symbol empty string and NA tables:")
print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
print(table(is.na(res_reduced_df$hgnc_symbol)))
dim(res_reduced_df)
res_reduced_df[1:2,]
setting_name = unique(sample_info_dg$model_day)
fwrite(res_df,
sprintf("%s/%s_%s_%s_DEseq2_raw.tsv",
raw_output_dir, release_name, setting_name, geno),
sep="\t")
fwrite(res_reduced_df,
sprintf("%s/%s_%s_%s_DEseq2.tsv",
processed_output_dir, release_name, setting_name, geno),
sep="\t")
}
}
}## [1] "ExM"
##
## MEF2C_CE MEF2C_KO MEF2C_PTC MEF2C_WT1 MEF2C_WT2 MEF2C_WT3 MEIS1_CE
## CE 3 0 0 0 0 0 3
## KO 0 3 0 0 0 0 0
## PTC 0 0 3 0 0 0 0
## WT 0 0 0 1 1 1 0
##
## MEIS1_KO MEIS1_PTC MEIS1_WT MEIS2_CE MEIS2_KO MEIS2_PTC MEIS2_WT
## CE 0 0 0 3 0 0 0
## KO 3 0 0 0 1 0 0
## PTC 0 3 0 0 0 3 0
## WT 0 0 3 0 0 0 3
##
## TRUE
## 34
##
## MEF2C MEIS1 MEIS2 WT
## CE 3 3 3 0
## KO 3 3 1 0
## PTC 3 3 3 0
## WT 0 0 0 9
##
## MEF2C_CE MEF2C_KO MEF2C_PTC MEIS1_CE MEIS1_KO MEIS1_PTC MEIS2_CE MEIS2_KO
## 3 3 3 3 3 3 3 1
## MEIS2_PTC WT_WT
## 3 9
## [1] 10
##
## MEF2C_CE MEF2C_KO MEF2C_PTC MEIS1_CE MEIS1_KO
## 20231004_23-robson-011 0 0 0 3 3
## 20231124_23-robson-013 3 3 3 0 0
##
## MEIS1_PTC MEIS2_CE MEIS2_KO MEIS2_PTC WT_WT
## 20231004_23-robson-011 3 3 1 3 6
## 20231124_23-robson-013 0 0 0 0 3
## [1] "-----------------------------------"
## [1] "DE analysis group: ExM_day_6_20231004_23-robson-011"
## [1] "DE analysis group ExM_day_6_20231004_23-robson-011 DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25641 to 23897"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 35.51818 secs
## [1] "prop of non-null for rd: 2.32e-01"
## [1] "DE group ExM_day_6_20231004_23-robson-011 under KO gene MEIS1:"
## [1] "number of DE genes from knock out strategy CE: 61"
## [1] "number of DE genes from knock out strategy KO: 76"
## [1] "number of DE genes from knock out strategy PTC: 50"
## [1] "number of common DE genes by knock out strategies CE and KO: 31"
## [1] "number of common DE genes by knock out strategies KO and PTC: 30"
## [1] "number of common DE genes by knock out strategies PTC and CE: 29"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18953 4944
##
## FALSE TRUE
## 18953 4944
## [1] "DE group ExM_day_6_20231004_23-robson-011 under KO gene MEIS2:"
## [1] "There is no result for knock strategy KO due to number of knockout samples being only 1."
## [1] "number of DE genes from knock out strategy CE: 44"
## [1] "number of DE genes from knock out strategy PTC: 176"
## [1] "number of common DE genes by knock out strategies PTC and CE: 40"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18953 4944
##
## FALSE TRUE
## 18953 4944
## [1] "-----------------------------------"
## [1] "DE analysis group: ExM_day_6_20231124_23-robson-013"
## [1] "DE analysis group ExM_day_6_20231124_23-robson-013 DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25641 to 24649"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 24649 to 24060"
## Time difference of 11.42949 secs
## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## Time difference of 8.686329 secs
## [1] "DE group ExM_day_6_20231124_23-robson-013 under KO gene MEF2C:"
## [1] "number of DE genes from knock out strategy CE: 0"
## [1] "number of DE genes from knock out strategy KO: 1"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18994 5066
##
## FALSE TRUE
## 18994 5066
## [1] "PrS"
##
## BMLHE40_CE BMLHE40_KO BMLHE40_PTC BMLHE40_WT MDX1_CE MDX1_KO MDX1_KOLF2
## CE 3 0 0 0 3 0 0
## KO 0 3 0 0 0 3 0
## PTC 0 0 3 0 0 0 0
## WT 0 0 0 3 0 0 3
##
## MDX1_PTC NC0A3_CE NC0A3_KO NC0A3_PTC NC0A3_WT RUNX1_CE RUNX1_KO RUNX1_PTC
## CE 0 3 0 0 0 3 0 0
## KO 0 0 3 0 0 0 3 0
## PTC 3 0 0 3 0 0 0 3
## WT 0 0 0 0 3 0 0 0
##
## RUNX1_WT1 RUNX1_WT2 RUNX1_WT3
## CE 0 0 0
## KO 0 0 0
## PTC 0 0 0
## WT 1 1 1
##
## TRUE
## 48
##
## BHLHE40 MXD1 NCOA3 RUNX1 WT
## CE 3 3 3 3 0
## KO 3 3 3 3 0
## PTC 3 3 3 3 0
## WT 0 0 0 0 12
##
## BHLHE40_CE BHLHE40_KO BHLHE40_PTC MXD1_CE MXD1_KO MXD1_PTC
## 3 3 3 3 3 3
## NCOA3_CE NCOA3_KO NCOA3_PTC RUNX1_CE RUNX1_KO RUNX1_PTC
## 3 3 3 3 3 3
## WT_WT
## 12
## [1] 13
##
## BHLHE40_CE BHLHE40_KO BHLHE40_PTC MXD1_CE MXD1_KO
## 20230809_23-robson-007-run2 0 0 0 3 3
## 20231124_23-robson-013 0 0 0 0 0
## 20240307_24-robson-002 3 3 3 0 0
## 20240307_24-robson-005 0 0 0 0 0
##
## MXD1_PTC NCOA3_CE NCOA3_KO NCOA3_PTC RUNX1_CE
## 20230809_23-robson-007-run2 3 0 0 0 0
## 20231124_23-robson-013 0 0 0 0 3
## 20240307_24-robson-002 0 0 0 0 0
## 20240307_24-robson-005 0 3 3 3 0
##
## RUNX1_KO RUNX1_PTC WT_WT
## 20230809_23-robson-007-run2 0 0 3
## 20231124_23-robson-013 3 3 3
## 20240307_24-robson-002 0 0 3
## 20240307_24-robson-005 0 0 3
## [1] "-----------------------------------"
## [1] "DE analysis group: PrS_day_6"
## [1] "DE analysis group PrS_day_6 DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25641 to 23756"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 43.41503 secs
## [1] "prop of non-null for rd: 5.05e-01"
## [1] "DE group PrS_day_6 under KO gene BHLHE40:"
## [1] "number of DE genes from knock out strategy CE: 429"
## [1] "number of DE genes from knock out strategy KO: 412"
## [1] "number of DE genes from knock out strategy PTC: 739"
## [1] "number of common DE genes by knock out strategies CE and KO: 298"
## [1] "number of common DE genes by knock out strategies KO and PTC: 337"
## [1] "number of common DE genes by knock out strategies PTC and CE: 320"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18951 4805
##
## FALSE TRUE
## 18951 4805
## [1] "DE group PrS_day_6 under KO gene MXD1:"
## [1] "number of DE genes from knock out strategy CE: 120"
## [1] "number of DE genes from knock out strategy KO: 344"
## [1] "number of DE genes from knock out strategy PTC: 90"
## [1] "number of common DE genes by knock out strategies CE and KO: 37"
## [1] "number of common DE genes by knock out strategies KO and PTC: 39"
## [1] "number of common DE genes by knock out strategies PTC and CE: 37"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18951 4805
##
## FALSE TRUE
## 18951 4805
## [1] "DE group PrS_day_6 under KO gene NCOA3:"
## [1] "number of DE genes from knock out strategy CE: 29"
## [1] "number of DE genes from knock out strategy KO: 107"
## [1] "number of DE genes from knock out strategy PTC: 1"
## [1] "number of common DE genes by knock out strategies CE and KO: 17"
## [1] "number of common DE genes by knock out strategies KO and PTC: 1"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18951 4805
##
## FALSE TRUE
## 18951 4805
## [1] "DE group PrS_day_6 under KO gene RUNX1:"
## [1] "number of DE genes from knock out strategy CE: 1941"
## [1] "number of DE genes from knock out strategy KO: 351"
## [1] "number of DE genes from knock out strategy PTC: 2015"
## [1] "number of common DE genes by knock out strategies CE and KO: 173"
## [1] "number of common DE genes by knock out strategies KO and PTC: 188"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1098"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18951 4805
##
## FALSE TRUE
## 18951 4805
Save the size dataframe out
colnames(df_size) = c("DE_group", "knockout_gene_strategy", "n_ko", "n_WT")
df_size = as.data.frame(df_size)
multi_rows = which(table(df_size$knockout_gene_strategy)>1)
multi_rows## named integer(0)
df_size[which(df_size$knockout_gene_strategy%in%names(multi_rows)),]## [1] DE_group knockout_gene_strategy n_ko
## [4] n_WT
## <0 rows> (or 0-length row.names)
fwrite(df_size,
sprintf("%s/%s_DE_n_samples.tsv", output_dir, release_name),
sep="\t")gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 8132310 434.4 12621687 674.1 NA 12621687 674.1
## Vcells 35020614 267.2 79066021 603.3 65536 79066021 603.3
sessionInfo()## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] readxl_1.4.3 UpSetR_1.4.0
## [3] ComplexHeatmap_2.14.0 data.table_1.15.4
## [5] dplyr_1.1.4 ggpointdensity_0.1.0
## [7] viridis_0.6.5 viridisLite_0.4.2
## [9] DESeq2_1.38.3 SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.3.0 GenomicRanges_1.50.2
## [15] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [17] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [19] RColorBrewer_1.1-3 MASS_7.3-60
## [21] stringr_1.5.1 ggpubr_0.6.0
## [23] ggrepel_0.9.5 ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-8 bit64_4.0.5 doParallel_1.0.17
## [4] httr_1.4.7 tools_4.2.3 backports_1.5.0
## [7] bslib_0.8.0 utf8_1.2.4 R6_2.5.1
## [10] DBI_1.2.3 colorspace_2.1-1 GetoptLong_1.0.5
## [13] withr_3.0.1 tidyselect_1.2.1 gridExtra_2.3
## [16] bit_4.0.5 compiler_4.2.3 cli_3.6.3
## [19] Cairo_1.6-2 DelayedArray_0.24.0 labeling_0.4.3
## [22] sass_0.4.9 scales_1.3.0 digest_0.6.37
## [25] rmarkdown_2.28 XVector_0.38.0 pkgconfig_2.0.3
## [28] htmltools_0.5.8.1 highr_0.11 fastmap_1.2.0
## [31] GlobalOptions_0.1.2 rlang_1.1.4 rstudioapi_0.16.0
## [34] RSQLite_2.3.7 farver_2.1.2 shape_1.4.6.1
## [37] jquerylib_0.1.4 generics_0.1.3 jsonlite_1.8.8
## [40] BiocParallel_1.32.6 car_3.1-2 RCurl_1.98-1.16
## [43] magrittr_2.0.3 GenomeInfoDbData_1.2.9 Matrix_1.5-4.1
## [46] Rcpp_1.0.13 munsell_0.5.1 fansi_1.0.6
## [49] abind_1.4-5 lifecycle_1.0.4 stringi_1.8.4
## [52] yaml_2.3.10 carData_3.0-5 zlibbioc_1.44.0
## [55] plyr_1.8.9 blob_1.2.4 parallel_4.2.3
## [58] crayon_1.5.3 lattice_0.22-6 Biostrings_2.66.0
## [61] annotate_1.76.0 circlize_0.4.16 KEGGREST_1.38.0
## [64] locfit_1.5-9.10 knitr_1.48 pillar_1.9.0
## [67] rjson_0.2.21 ggsignif_0.6.4 geneplotter_1.76.0
## [70] codetools_0.2-20 XML_3.99-0.17 glue_1.7.0
## [73] evaluate_0.24.0 foreach_1.5.2 vctrs_0.6.5
## [76] png_0.1-8 cellranger_1.1.0 gtable_0.3.5
## [79] purrr_1.0.2 tidyr_1.3.1 clue_0.3-65
## [82] cachem_1.1.0 xfun_0.47 xtable_1.8-4
## [85] broom_1.0.6 rstatix_0.7.2 tibble_3.2.1
## [88] iterators_1.0.14 AnnotationDbi_1.60.2 memoise_2.0.1
## [91] cluster_2.1.6